# Load required libraries
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Read the CSV file
race <- read_csv("https://raw.githubusercontent.com/reisanar/datasets/master/marathon_results_2017.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 26410 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Bib, Name, M/F, City, State, Country, 10K, 15K, 20K, Proj Time
## dbl (4): Age, Overall, Gender, Division
## time (8): 5K, Half, 25K, 30K, 35K, 40K, Pace, Official Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
race
## # A tibble: 26,410 × 22
## Bib Name Age `M/F` City State Country `5K` `10K` `15K` `20K` Half
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <time> <chr> <chr> <chr> <time>
## 1 11 Kiru… 24 M Keri… <NA> KEN 15'25" 0:30… 0:45… 1:01… 01:04:35
## 2 17 Rupp… 30 M Port… OR USA 15'24" 0:30… 0:45… 1:01… 01:04:35
## 3 23 Osak… 25 M Mach… <NA> JPN 15'25" 0:30… 0:45… 1:01… 01:04:36
## 4 21 Biwo… 32 M Mamm… CA USA 15'25" 0:30… 0:45… 1:01… 01:04:45
## 5 9 Cheb… 31 M Mara… <NA> KEN 15'25" 0:30… 0:45… 1:01… 01:04:35
## 6 15 Abdi… 40 M Phoe… AZ USA 15'25" 0:30… 0:45… 1:01… 01:04:35
## 7 63 Maiy… 33 M Colo… CO USA 15'25" 0:30… 0:45… 1:01… 01:04:36
## 8 7 Sefi… 28 M Addi… <NA> ETH 15'24" 0:30… 0:46… 1:02… 01:06:04
## 9 18 Pusk… 27 M Euge… OR USA 15'24" 0:30… 0:45… 1:01… 01:04:53
## 10 20 Ward… 28 M Kays… UT USA 15'25" 0:30… 0:45… 1:01… 01:04:53
## # … with 26,400 more rows, and 10 more variables: `25K` <time>, `30K` <time>,
## # `35K` <time>, `40K` <time>, Pace <time>, `Proj Time` <chr>,
## # `Official Time` <time>, Overall <dbl>, Gender <dbl>, Division <dbl>
race <- race %>%
filter(Country =="USA") %>%
na.omit()
race <- subset(race, !(State %in% c("AA", "AE","AP","MH","GU","PR","VI","DC")))
colnames(race) <- gsub("\\s", "_", colnames(race))
race <- rename(race, genders = `M/F`)
avg_time <-race %>%
group_by(Age) %>%
summarise(avg_overall_time = mean(as.numeric(Official_Time), na.rm = TRUE),
avg_overall_time = avg_overall_time/3600)
avg_time
## # A tibble: 66 × 2
## Age avg_overall_time
## <dbl> <dbl>
## 1 18 4.30
## 2 19 4.30
## 3 20 3.97
## 4 21 3.84
## 5 22 3.97
## 6 23 3.92
## 7 24 3.98
## 8 25 3.91
## 9 26 3.83
## 10 27 3.94
## # … with 56 more rows
Here is where I group by age and find the average overall time for each age group.
avg_time_plot<-
ggplot(data = avg_time) +
geom_point(aes(x = Age, y = avg_overall_time,text = row.names(avg_overall_time))) +
scale_x_continuous(limits = c(18, NA), breaks = seq(18, max(avg_time$Age), by = 5)) +
labs(x = "Age", y = "Average Finish Time In Hours", title = "Average Finish Time Per Age Group For Runners In USA")+
theme(axis.text.x = element_text(angle = 25, hjust = 1))+
theme_light()
## Warning in geom_point(aes(x = Age, y = avg_overall_time, text =
## row.names(avg_overall_time))): Ignoring unknown aesthetics: text
avg_time_plot <- ggplotly(avg_time_plot)
htmlwidgets::saveWidget(avg_time_plot, "fancy_plot_avg_time_plot.html")
Here we can see that 28-38 year olds typically have the quickest marathon race times. It was interesting to see the 84 year old finisher! I can’t imagine running for 6 hours a 25 year old.
#ggsave("final.jpg", plot = avg_time_plot, width = 20, height = 9, dpi = 300)
After seeing this breakdown I wanted to see how different the average finish times were for males and females
avg_time_gender <- race %>%
group_by(Age,genders) %>%
summarise(avg_overall_time = mean(as.numeric(Official_Time), na.rm = TRUE),
avg_overall_time = avg_overall_time/3600,
)
## `summarise()` has grouped output by 'Age'. You can override using the `.groups`
## argument.
avg_time_gender_plot <- ggplot(data = avg_time_gender) +
geom_point(aes(x = Age, y = avg_overall_time,fill=genders)) +
facet_wrap(~ genders, ncol = 2) + # Adjust the ncol parameter
scale_x_continuous(limits = c(18, NA), breaks = seq(18, max(avg_time$Age), by = 5)) +
labs(label = FALSE,x = "Age", y = "Finish Time In Hours", title = "Average Finish Time For Female and Male Runners In USA") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x = element_text(margin = margin(t = 10, unit = "pt")),legend.position = "none")+
theme_light()+
scale_fill_manual(values = c("#FF00FF", "#4169E1")) # Specify your desired fill colors
ggplotly(avg_time_gender_plot)%>%
layout(width = 950) # Adjust the width value
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
This graph better visualizes the average finish time for males and females. It was interesting to see the majority of men finish under 4 hours between the age of 18-53 while most females average just over 4 hours
#ggsave("plot_gender_new.jpg", plot = avg_time_gender_plot, width = 15, height = 8, dpi = 300, type = "cairo")
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
# Load world shapefile from Natural Earth
# https://www.naturalearthdata.com/downloads/110m-cultural-vectors/
world_shapes <- read_sf("spatial-data/ne_50m_admin_1_states_provinces/ne_50m_admin_1_states_provinces.shp")
Since world_shapes and race both have state
names abbreviated i will rename state to match postal so i am able to
join the tables
race <- rename(race, postal = `State`)
There are other countries listed I just want to focus on the avg run time per state in the USA
avg_state <- race %>%
filter(Country =="USA") %>%
group_by(postal) %>%
summarise(avg_overall_time = mean(as.numeric(Official_Time), na.rm = TRUE),
avg_overall_time = avg_overall_time/3600,
) %>%
na.omit()
avg_state %>%
arrange()
## # A tibble: 50 × 2
## postal avg_overall_time
## <chr> <dbl>
## 1 AK 3.42
## 2 AL 3.85
## 3 AR 3.86
## 4 AZ 3.88
## 5 CA 3.90
## 6 CO 3.72
## 7 CT 3.93
## 8 DE 3.76
## 9 FL 4.00
## 10 GA 3.94
## # … with 40 more rows
Leaving out american territories and military bases
avg_state <- subset(avg_state, !(postal %in% c("AA", "AE","AP","MH","GU","PR","VI","DC","AK")))
Changing the shapes dataset to only have USA
usa_shapes <- world_shapes %>%
filter(admin =="United States of America")
#join the two
usa_shapes<-usa_shapes %>%
left_join(avg_state, by="postal")
usa_shapes <- usa_shapes %>%
select(postal,avg_overall_time) %>%
na.omit()
This map shows the average finish time for each state. It showed Alaska as the state with the quickest time.
map_interactive <- ggplot() +
geom_sf(data = usa_shapes, aes(fill = avg_overall_time, text = paste("State: ", postal)),
color = "white") +
scale_fill_gradient(low = "#66a0ff", high = "#4b595e") +
labs(title = "Average Finish Time For Each State", fill = "Finish Time In Hours", caption = "Finishers of the Boston Marathon of 2017") +
theme(legend.position = "bottom") +
theme_void()
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat =
## stat, : Ignoring unknown aesthetics: text
ggplotly(map_interactive)
This map shows how the average finishing time for each state. I wanted an interactive map so when you hovered over each state you know what state it is plus that’s specific’s state average finish time. This dataset was from the Boston Marathon I was kind of surprised Massachusetts didn’t have a quicker finish time. It was interesting to see how people from Alaska had the quickest average.
ggsave("map_interactive.jpg", plot = map_interactive, width = 8, height = 8, dpi = 300)
race_usa <- race %>%
filter(Country =="USA") %>%
group_by(postal,Age) %>%
summarise(avg_overall_time = mean(as.numeric(Official_Time), na.rm = TRUE),
avg_overall_time = avg_overall_time/3600,
) %>%
na.omit()
## `summarise()` has grouped output by 'postal'. You can override using the
## `.groups` argument.
race_usa <- subset(race_usa, !(postal %in% c("AA", "AE","AP","MH","GU","PR","VI","DC")))
race_usa
## # A tibble: 2,151 × 3
## # Groups: postal [50]
## postal Age avg_overall_time
## <chr> <dbl> <dbl>
## 1 AK 24 3.20
## 2 AK 26 2.68
## 3 AK 27 3.50
## 4 AK 28 3.34
## 5 AK 29 3.70
## 6 AK 30 2.64
## 7 AK 31 2.86
## 8 AK 32 2.67
## 9 AK 35 3.37
## 10 AK 36 3.11
## # … with 2,141 more rows
race_model <-ggplot(race_usa, aes(x = Age, y = avg_overall_time)) +
geom_point(alpha = 0.09) +
geom_smooth(method = "lm", se = FALSE, color = "#3B8DBD") +
scale_x_continuous(limits = c(18, NA), breaks = seq(18, max(avg_time$Age), by = 5)) +
labs(title = "Relationship between Age and Finish Time in Races for American Athleats", y = "Finish Time in Hours", caption = "Finishers of the Boston Marathon of 2017") +
theme(
plot.title = element_text(hjust = -.5, family = "Arial", face = "bold", size = 12),
axis.title.y = element_text(family = "Arial", size = 12),
axis.title.x = element_text(family = "Arial", size = 12,margin = margin(t = 10, unit = "pt"))
)+
theme_light()
race_model
## `geom_smooth()` using formula = 'y ~ x'
Since my earlier visualizations focused on age and race time, I decided
to model the relationship between age and race time for American racers.
Since I learned more about colors I changed the default blue to the
official blue Boston color
#3B8DBD
#ggsave("race_model.png", plot = race_model, dpi = 300)